# Author: Mark Richardson
# Date started: 03/03/2021
# Date finished: 02/07/2022
# Purpose: Estimate performance ratings
# Date revised: 06/09/2022
# Revision purpose: Save a ratings-level data set to examine partisan bias

# Date revised: 10/12/2022
# Revision purpose: Add a hierarchical model that uses all ratings
# Proofed 11/05/2023

# Date revised: 11/16/2023
# Revision purpose: Add a 5th chain to non-hierarchical ratings

# Date revised: 02/20/2024
# Revision purpose: Log R output


# Open a log
library(logr)

lf <- log_open(file_name = "02_performance_ratings_estimation.log")

log_code()

# Load packages
library(dplyr)
library(ggplot2)

library(rstan)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())

library(stringr)

# Load data
load("data/ratings/performance_ratings/01_performance_ratings_for_model.RData")

#### Create a rating-level data frame ####

perf_stan <- list()

for (i in 1:5) {
  
  var_num <- paste0("_", i)
  
  perf_stan[[i]] <- perf_ratings %>%
    select(id:office, contains(var_num)) %>%
    rename("agency_rated" = paste0("agency_rated", var_num),
           "perf_rating_org" = paste0("perf_rating", var_num, "_org"),
           "perf_rating" = paste0("perf_rating", var_num))
  
}

perf_stan <- bind_rows(perf_stan) %>%
  tidyr::drop_na(perf_rating) # Drop rows with no rating

rm(i, var_num)

#### Wrangle the ratings ####

# Calculate ratings per agency
perf_stan <- perf_stan %>%
  group_by(agency_rated) %>%
  mutate(n_ratings_agency = n()) %>%
  ungroup()

table(perf_stan$n_ratings_agency)

# Calculate number of ratings per rater
perf_stan <- perf_stan %>%
  group_by(id) %>%
  mutate(n_ratings_rater = n()) %>%
  ungroup()

table(perf_stan$n_ratings_rater)

#### Prepare data for stan ####

# Create an dept index and get raters per dept

dept_data <- perf_stan %>%
  distinct(dept, id) %>%
  group_by(dept) %>%
  mutate(n_raters_dept = n()) %>%
  ungroup() %>%
  distinct(dept, n_raters_dept) %>%
  arrange(dept) %>%
  mutate(dept_index = 1:n())

# Create an agency index

agency_data <- perf_stan %>%
  distinct(agency_rated) %>%
  arrange(agency_rated) %>%
  mutate(agency_index = 1:n())

# Create a rater_index

rater_data <- perf_stan %>%
  distinct(id) %>%
  mutate(rater_index = 1:n())

# Join indices with ratings-level data

n <- nrow(perf_stan)

perf_stan <- perf_stan %>%
  full_join(agency_data, by = "agency_rated") %>%
  full_join(rater_data, by = "id") %>%
  full_join(dept_data, by = "dept")

nrow(perf_stan) == n # No rows added

rm(n)

# Get rater and dept index combinations for non-centered parameterization

dept_rater_grp <- perf_stan %>%
  select(rater_index, dept_index) %>%
  distinct() %>%
  arrange(rater_index, dept_index)

# Prepare informed ratings

agency_data <- agency_data %>%
  left_join(perf_inf_priors, by = "agency_rated") %>%
  rename(n_inf_ratings = n)

# Set informed prior for cases with few ratings
# Set standard deviation for cases with 5 or fewer informed ratings
# to max observed standard deviation

agency_data <- agency_data %>%
  mutate(perf_inf_se = sqrt(perf_inf_var / n_inf_ratings),
         perf_inf_sd = sqrt(perf_inf_var),
         perf_inf_mean_stan = if_else(is.na(perf_inf_mean), 0, perf_inf_mean),
         perf_inf_sd_stan = if_else(n_inf_ratings <= 5 | is.na(perf_inf_sd),
                                    max(perf_inf_sd, na.rm = TRUE), perf_inf_sd))

#### Assemble the data for Stan ####

for_stan <- list(y        = perf_stan$perf_rating, # Vector of ratings
                 N        = length(perf_stan$perf_rating), # Number of ratings
                 aa       = perf_stan$agency_index, # Vector of agency indexes
                 A        = length(unique(perf_stan$agency_index)), # Number of agencies
                 rr       = perf_stan$rater_index, # Rater index
                 R        = length(unique(perf_stan$rater_index)), # Number of raters
                 dd       = dept_rater_grp$dept_index, # Dept index (for mapping raters to departments)
                 D        = length(unique(perf_stan$dept_index)), # Number of departments
                 mu_prior = agency_data %>% arrange(agency_index) %>% pull(perf_inf_mean_stan), # Vector of informed means
                 sd_prior = agency_data %>% arrange(agency_index) %>% pull(perf_inf_sd_stan), # Vector of informed standard deviations
                 sd_naive = 1.5) # Standard deviation for the naive prior

#### Fit the models ####

#### Informed hierarchical model ####

perf_inf_sd_hier <- stan(file = "code/ratings/stan_models/ratings_model_informed_sd_hierarchical_multi.stan",
                         data = for_stan,
                         chains = 5,
                         warmup = 1000,
                         iter = 4000,
                         cores = 5,
                         refresh = 1000,
                         init = list(chain_1 = list(x = rep(-3, times = for_stan$A)),
                                     chain_2 = list(x = rep(-2, times = for_stan$A)),
                                     chain_3 = list(x = rep( 0, times = for_stan$A)),
                                     chain_4 = list(x = rep( 2, times = for_stan$A)),
                                     chain_5 = list(x = rep( 3, times = for_stan$A))),
                         seed = 31031617,
                         control = list(adapt_delta = 0.99,
                                        max_treedepth = 20,
                                        stepsize = 0.001),
                         diagnostic_file = "data/ratings/performance_ratings/diagnostics/perf_diag_informed_sd_hier.csv")

get_elapsed_time(perf_inf_sd_hier) / 60^2

check_treedepth(perf_inf_sd_hier)
check_energy(perf_inf_sd_hier)
check_divergences(perf_inf_sd_hier)

save.image(file = "data/ratings/performance_ratings/saves_during_estimation/perf_in_process.RData")

#### Naive hierarchical model ####

perf_naive_sd_hier <- stan(file = "code/ratings/stan_models/ratings_model_naive_hierarchical_multi.stan",
                           data = for_stan,
                           chains = 5,
                           warmup = 1000,
                           iter = 4000,
                           cores = 5,
                           refresh = 1000,
                           init = list(chain_1 = list(x = rep(-3, times = for_stan$A)),
                                       chain_2 = list(x = rep(-2, times = for_stan$A)),
                                       chain_3 = list(x = rep( 0, times = for_stan$A)),
                                       chain_4 = list(x = rep( 2, times = for_stan$A)),
                                       chain_5 = list(x = rep( 3, times = for_stan$A))),
                           seed = 31031617,
                           control = list(adapt_delta = 0.99,
                                          max_treedepth = 20,
                                          stepsize = 0.001),
                           diagnostic_file = "data/ratings/performance_ratings/diagnostics/perf_diag_naive_sd_hier.csv")

get_elapsed_time(perf_naive_sd_hier) / 60^2

check_treedepth(perf_naive_sd_hier)
check_energy(perf_naive_sd_hier)
check_divergences(perf_naive_sd_hier)

save.image(file = "data/ratings/performance_ratings/saves_during_estimation/perf_in_process.RData")

#### Informed model ####

perf_inf_sd <- stan(file = "code/ratings/stan_models/ratings_model_informed_sd.stan",
               data = for_stan,
               chains = 5,
               warmup = 1000,
               iter = 4000,
               cores = 5,
               refresh = 1000,
               init = list(chain_1 = list(x = rep(-3, times = for_stan$A)),
                           chain_2 = list(x = rep(-2, times = for_stan$A)),
                           chain_3 = list(x = rep( 0, times = for_stan$A)),
                           chain_4 = list(x = rep( 2, times = for_stan$A)),
                           chain_5 = list(x = rep( 3, times = for_stan$A))),
               seed = 31031617,
               control = list(adapt_delta = 0.90,
                              max_treedepth = 20,
                              stepsize = 0.001),
               diagnostic_file = "data/ratings/performance_ratings/diagnostics/perf_diag_informed_sd.csv")

get_elapsed_time(perf_inf_sd) / 60^2

check_treedepth(perf_inf_sd)
check_energy(perf_inf_sd)
check_divergences(perf_inf_sd)

#### Naive model ####

perf_naive <- stan(file = "code/ratings/stan_models/ratings_model_naive.stan",
                   data = for_stan,
                   chains = 5,
                   warmup = 1000,
                   iter = 4000,
                   cores = 5,
                   refresh = 1000,
                   init = list(chain_1 = list(x = rep(-3, times = for_stan$A)),
                               chain_2 = list(x = rep(-2, times = for_stan$A)),
                               chain_3 = list(x = rep( 0, times = for_stan$A)),
                               chain_4 = list(x = rep( 2, times = for_stan$A)),
                               chain_5 = list(x = rep( 3, times = for_stan$A))),
                   seed = 31031617,
                   control = list(adapt_delta = 0.90,
                                  max_treedepth = 20,
                                  stepsize = 0.001),
                   diagnostic_file = "data/ratings/performance_ratings/diagnostics/perf_diag_naive.csv")

get_elapsed_time(perf_naive) / 60^2

check_treedepth(perf_naive)
check_energy(perf_naive)
check_divergences(perf_naive)

#### Save fitted models ####

save.image("data/ratings/performance_ratings/02_performance_fitted_models.RData")

log_close()